Divgrad

计算逐元素除法操作 (Y = x1 / x2) 的梯度。

\[dx_1 = \frac{\partial L}{\partial x_1} = \frac{\partial L}{\partial Y} \cdot \frac{1}{x_2} = \frac{dy}{x_2}\]
\[dx_2 = \frac{\partial L}{\partial x_2} = \frac{\partial L}{\partial Y} \cdot \frac{-x_1}{x_2^2} = - \frac{dy \cdot x_1}{x_2^2}\]

Divgrad1l版本专门用于 x1 张量维度大于或等于 x2 张量的广播场景。Divgrad2l版本专门用于 x2 张量维度大于或等于 x1 张量的广播场景。

输入:
  • dy - 来自后一层的上游梯度张量。

  • x1 - 前向传播时的第一个输入张量(被除数)。

  • x2 - 前向传播时的第二个输入张量(除数)。

  • params - 参数打包成结构体。

  • large_shape - x1、x2 中维度较大的张量的形状。

  • small_shape - x1、x2 中维度较小的张量的形状。

  • out_shape - 输出张量 dx1、dx2 的形状。

  • ndims - 张量的维度数。

  • large_strides - 维度较大张量的步长信息。

  • small_strides - 维度较小张量的步长信息。

  • out_strides - 输出张量的步长信息。

  • large_multiples - 维度较大张量的广播倍数。

  • small_multiples - 维度较小张量的广播倍数。

  • tile_data0 - 临时工作空间地址。

  • tile_data1 - 临时工作空间地址。

  • tile_data2 - 临时工作空间地址。

  • indices - 用于广播计算的临时索引空间地址。

  • core_mask - 核掩码。

输出:
  • dx1 - 写入计算出的对 x1 的梯度。

  • dx2 - 写入计算出的对 x2 的梯度。

支持平台:

FT78NE MT7004

备注:

  • FT78NE 支持fp32

  • MT7004 支持fp16, fp32

参数结构体:

} Parameter;

共享存储版本:

/*
 * Shared-storage variants of the element-wise division gradient
 * (forward op: Y = x1 / x2):
 *     dx1 =  dy / x2
 *     dx2 = -dy * x1 / (x2 * x2)
 *
 * fp_div_grad_s works on float (fp32) data; hp_div_grad_s works on float16
 * (fp16) data. Per the platform notes, FT78NE supports fp32 only, while
 * MT7004 supports both fp16 and fp32.
 *
 *   dy         in   upstream gradient tensor coming from the following layer
 *   dx1        out  receives the computed gradient with respect to x1
 *   dx2        out  receives the computed gradient with respect to x2
 *   x1_data    in   first forward-pass input tensor (the dividend)
 *   x2_data    in   second forward-pass input tensor (the divisor)
 *   params     in   remaining arguments packed into a Parameter struct
 *                   (shapes, ndims, strides, broadcast multiples, temporary
 *                   work buffers tile_data0..2 and the indices scratch space)
 *   core_mask  in   core mask
 */
void fp_div_grad_s(float *dy, float *dx1, float *dx2, float *x1_data, float *x2_data, Parameter *params, int core_mask);
void hp_div_grad_s(float16 *dy, float16 *dx1, float16 *dx2, float16 *x1_data, float16 *x2_data, Parameter *params, int core_mask);

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <divgrad.h>
 4int main(int argc, char* argv[]) {
 5    float *dy = (float *)0x81000000;//输入,初始化
 6    float *dx1 = (float *)0x82000000;//输出,需要初始化
 7    float *dx2 = (float *)0x83000000;//输出,需要初始化
 8    float *x1_data = (float *)0x84000000;//输入,需要初始化
 9    float *x2_data = (float *)0x85000000;//输入,需要初始化
10    float *tile_data0 = (float *)0x86000000;//中间结果,不需要初始化
11    float *tile_data1 = (float *)0x87000000;//中间结果,不需要初始化
12    float *tile_data2 = (float *)0x88000000;//中间结果,不需要初始化
13    float *check_dx1 = (float *)0x89000000;//输出,需要初始化
14    float *check_dx2 = (float *)0x8A000000;//输出,需要初始化
15
16    long long ndims = 4;
17    long long dy_size;
18    long long x1_size;
19    long long x2_size;
20
21    int *large_strides = (int *)0x8B000000;//不需要初始化
22    int *small_strides = (int *)0x8B100000; //不需要初始化
23    int *out_strides = (int *)0x8B200000; //不需要初始化
24    int *large_multiples = (int *)0x8B300000; //不需要初始化
25    int *small_multiples = (int *)0x8B400000; //不需要初始化
26    int *indices = (int *)0x8B500000;
27    int *x1_shape = (int *)0x8B600000;
28    int *x2_shape = (int *)0x8B700000;
29
30    int i = 0;
31    srand(seed++);
32
33    //初始化
34    x1_shape[0] = 4;  x1_shape[1] = 4;  x1_shape[2] = 4;  x1_shape[3] = 4;
35    x2_shape[0] = 4;  x2_shape[1] = 4;  x2_shape[2] = 4;  x2_shape[3] = 1;
36
37    int *large_shape = x1_shape;
38    int *small_shape = x2_shape;
39    int *output_shape = large_shape;
40
41    dy_size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3];
42    x1_size = x1_shape[0] * x1_shape[1] * x1_shape[2] * x1_shape[3];
43    x2_size = x2_shape[0] * x2_shape[1] * x2_shape[2] * x2_shape[3];
44
45    for(i = 0; i < dy_size; ++i) {
46        dy[i] = (float)(rand()%1000)/100 + 1.0f;
47    }
48
49    for(i = 0; i < x1_size; ++i) {
50        x1_data[i] = (float)(rand()%1000)/100 + 1.0f;
51    }
52
53    for(i = 0; i < x2_size; ++i) {
54        x2_data[i] = (float)(rand()%1000)/100 + 1.0f;
55    }
56
57    memset(indices, 0, ndims*sizeof(int));
58
59    Parameter params;
60
61    params.tile_data0 = tile_data0; //5
62    params.tile_data1 = tile_data1; //6
63    params.tile_data2 = tile_data2; //7
64    params.large_shape = large_shape;
65    params.small_shape = small_shape;
66    params.out_shape = output_shape;
67    params.ndims = ndims;
68    params.dy_size = dy_size;
69    params.x1_size = x1_size;
70    params.x2_size = x2_size;
71    params.large_strides = large_strides;
72    params.small_strides = small_strides;
73    params.out_strides = out_strides;
74    params.large_multiples = large_multiples;
75    params.small_multiples = small_multiples;
76    params.indices = indices;
77    params.x1_shape = x1_shape;
78    params.x2_shape = x2_shape;
79
80    fp_div_grad_s(dy, dx1, dx2, x1_data, x2_data, &params, core_mask);
81}

私有存储版本:

/*
 * Private-storage variants of the element-wise division gradient
 * (forward op: Y = x1 / x2):
 *     dx1 =  dy / x2
 *     dx2 = -dy * x1 / (x2 * x2)
 *
 * fp_div_grad_p works on float (fp32) data; hp_div_grad_p works on float16
 * (fp16) data. Per the platform notes, FT78NE supports fp32 only, while
 * MT7004 supports both fp16 and fp32.
 *
 *   dy       in   upstream gradient tensor coming from the following layer
 *   dx1      out  receives the computed gradient with respect to x1
 *   dx2      out  receives the computed gradient with respect to x2
 *   x1_data  in   first forward-pass input tensor (the dividend)
 *   x2_data  in   second forward-pass input tensor (the divisor)
 *   params   in   remaining arguments packed into a Parameter struct
 *                 (shapes, ndims, strides, broadcast multiples, temporary
 *                 work buffers tile_data0..2 and the indices scratch space)
 */
void fp_div_grad_p(float *dy, float *dx1, float *dx2, float *x1_data, float *x2_data, Parameter *params);
void hp_div_grad_p(float16 *dy, float16 *dx1, float16 *dx2, float16 *x1_data, float16 *x2_data, Parameter *params);

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <divgrad.h>
 4int main(int argc, char* argv[]) {
 5    float *dy = (float *)0x10010000;//输入,初始化
 6    float *dx1 = (float *)0x10016000;//输出,需要初始化
 7    float *dx2 = (float *)0x10020000;//输出,需要初始化
 8    float *x1_data = (float *)0x10026000;//输入,需要初始化
 9    float *x2_data = (float *)0x10030000;//输入,需要初始化
10    float *tile_data0 = (float *)0x10036000;//中间结果,不需要初始化
11    float *tile_data1 = (float *)0x10040000;//中间结果,不需要初始化
12    float *tile_data2 = (float *)0x10046000;//中间结果,不需要初始化
13    int *x1_shape = (int *)0x10050000;
14    int *x2_shape = (int *)0x10051000;
15
16    long long ndims = 4;
17    long long dy_size;
18    long long x1_size;
19    long long x2_size;
20
21    int *large_strides = (int *)0x10053000;//不需要初始化
22    int *small_strides = (int *)0x10054000; //不需要初始化
23    int *out_strides = (int *)0x10055000; //不需要初始化
24    int *large_multiples = (int *)0x10056000; //不需要初始化
25    int *small_multiples = (int *)0x10057000; //不需要初始化
26    int *indices = (int *)0x10058000;
27
28    int i = 0;
29    srand(seed++);
30
31    //初始化
32    x1_shape[0] = 4;  x1_shape[1] = 4;  x1_shape[2] = 4;  x1_shape[3] = 4;
33    x2_shape[0] = 4;  x2_shape[1] = 4;  x2_shape[2] = 4;  x2_shape[3] = 1;
34
35    int *large_shape = x1_shape;
36    int *small_shape = x2_shape;
37    int *output_shape = large_shape;
38
39    dy_size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3];
40    x1_size = x1_shape[0] * x1_shape[1] * x1_shape[2] * x1_shape[3];
41    x2_size = x2_shape[0] * x2_shape[1] * x2_shape[2] * x2_shape[3];
42
43    for(i = 0; i < dy_size; ++i) {
44        dy[i] = (float)(rand()%1000)/100 + 1.0f;
45    }
46
47    for(i = 0; i < x1_size; ++i) {
48        x1_data[i] = (float)(rand()%1000)/100 + 1.0f;
49    }
50
51    for(i = 0; i < x2_size; ++i) {
52        x2_data[i] = (float)(rand()%1000)/100 + 1.0f;
53    }
54
55    memset(indices, 0, ndims*sizeof(int));
56
57    Parameter params;
58
59    params.tile_data0 = tile_data0; //5
60    params.tile_data1 = tile_data1; //6
61    params.tile_data2 = tile_data2; //7
62    params.large_shape = large_shape;
63    params.small_shape = small_shape;
64    params.out_shape = output_shape;
65    params.ndims = ndims;
66    params.dy_size = dy_size;
67    params.x1_size = x1_size;
68    params.x2_size = x2_size;
69    params.large_strides = large_strides;
70    params.small_strides = small_strides;
71    params.out_strides = out_strides;
72    params.large_multiples = large_multiples;
73    params.small_multiples = small_multiples;
74    params.indices = indices;
75    params.x1_shape = x1_shape;
76    params.x2_shape = x2_shape;
77
78    fp_div_grad_p(dy, dx1, dx2, x1_data, x2_data, &params);
79}